require(tidyverse)
require(lmtest)
require(data.table)
require(plm)
require(stargazer)
require(lubridate)
require(ivreg)
require(DescTools)
require(rsq)
require(miceadds)
require(jtools)
require(stats)
require(haven)
require(sandwich)
require(fastDummies)
require(sqldf)
require(scales)
require(stringr)
require(MatchIt)
require(modelsummary)
options(modelsummary_format_numeric_latex = "plain")
require(readxl)
library(purrr)

############################# Data Preparation - Last Annual Earnings Guidance #############################
# Set Working Directory
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

#### Load IBES Guidance data
guidance <- read.csv("ibes_guidance.csv")

### Initial Filters ###: Keep annual EPS forecasts in USD
# Annual Forecasts
guidance <- subset(guidance, pdicity == "ANN")
# EPS Forecasts
guidance <- subset(guidance, measure == "EPS")
# Forecasts in USD
guidance <- subset(guidance, curr == "USD")
# Point and range forecasts
guidance_range <- subset(guidance, range_desc == 1)
guidance_range$eps_f_range <- 1
guidance_point <- subset(guidance, range_desc == 2)
guidance_point$eps_f_range <- 0
rm(guidance)
# Define EPS forecasts (midpoint for range forecasts)
guidance_range$eps_f <- (guidance_range$val_1 + guidance_range$val_2) / 2
guidance_point$eps_f <- guidance_point$val_1
guidance <- rbind(guidance_point, guidance_range)
rm(guidance_range, guidance_point)
# Rename Guidance Date
guidance <- guidance %>%   rename(guidance_date = anndats)

#### Load IBES Actuals data
actuals <- read.csv("ibes_actuals.csv")
# Ensure that variables are consistently in lower case
names(actuals) <- tolower(names(actuals))
# Apply same filters: EPS, annual, USD
actuals <- subset(actuals, measure == "EPS")
actuals <- subset(actuals, pdicity == "ANN")
actuals <- subset(actuals, curr_act == "USD")

# Rename Earnings Announcement Date and Realized EPS
actuals <- actuals %>% rename(ea_date = anndats, eps_r = value)

# Get date variables for match
actuals$prd_yr <- substr(actuals$pends, 1, 4)
actuals$prd_mon <- substr(actuals$pends, 6, 7)
actuals$prd_mon <- str_remove(actuals$prd_mon, "^0+")
actuals$fyear_end <- as.Date(actuals$pends)
actuals <- actuals %>% select(ticker, cusip, oftic, fyear_end, eps_r, ea_date, prd_yr, prd_mon)
  guidance <- merge(guidance, actuals, by = c("ticker","prd_yr", "prd_mon"))
rm(actuals)
guidance <- guidance %>% rename(ncusip = cusip)

# For subsequent merges based on monthly data: define the month before the guidance date
guidance$guidance_date <- as.Date(guidance$guidance_date) 
guidance$previous_month <- guidance$guidance_date %m-% months(1)
guidance$month <- format(guidance$previous_month, "%Y%m")
guidance$previous_month <- NULL

### Import University of Michigan Consumer Sentiment data (based on monthly file)
sentiment <- read.csv("u_of_m_index.csv")
month_lookup <- c("January" = "01", "February" = "02", "March" = "03", "April" = "04",
                  "May" = "05", "June" = "06", "July" = "07", "August" = "08",
                  "September" = "09", "October" = "10", "November" = "11", "December" = "12")
sentiment$numeric_month <- month_lookup[sentiment$Month]
rm(month_lookup)
sentiment$month <- paste0(sentiment$YYYY, sentiment$numeric_month)
sentiment <- sentiment %>% select(month, ICS_ALL)
sentiment <- sentiment %>% rename(sentiment = ICS_ALL)

guidance <- merge(guidance, sentiment, by = "month")
rm(sentiment)

### Import pre-processed stock market data
stocks <- read.csv("data_for_further_analyses.csv", header=FALSE)
stocks <- stocks %>% 
  rename(permno = V1,
         month = V2,
         date = V3,
         size = V4,
         stock_issues = V5,
         beta = V6,
         ivol = V7,
         sue = V8,
         sue_date = V9,
         short = V10,
         turnover = V11,
         bm = V12,
         op = V13,
         ep = V14,
         disp = V15,
         analysts = V16,
         vola = V17,
         ret1m = V18,
         ret2m = V19,
         ret3m = V20,
         ret1yr = V21,
         ret2yr = V22,
         ret3yr = V23,
         ret4yr = V24,
         ret5yr = V25,
         cgo = V26,
         percgain = V27,
         cgo_placebo = V28,
         cgo_daily = V29
  )

linking_table <- read.csv("ncsuip-permno-cusip-merge.csv")
names(linking_table) <- tolower(names(linking_table))
stocks <- merge(stocks, linking_table, by = c("permno","date"))
rm(linking_table)

guidance <- merge(guidance,stocks, by = c("ncusip","month")) 
rm(stocks)

guidance_car <- read.csv("guidance_car_permno.csv")
guidance_car <- guidance_car %>% select(permno, evtdate, car)
guidance_car <- guidance_car %>% rename(
  guidance_date = evtdate,
  guidance_car = car
)
guidance <- merge(guidance, guidance_car, by = c("permno","guidance_date"))
rm(guidance_car)

ea_car <- read.csv("ea_car_permno.csv")
ea_car <- ea_car %>% select(permno, evtdate, car)
ea_car <- ea_car %>% rename(
  ea_date = evtdate,
  ea_car = car
)
guidance <- merge(guidance, ea_car, by = c("permno","ea_date"))
rm(ea_car)

compustat <- read.csv("crsp_compustat.csv")
names(compustat) <- tolower(names(compustat))
compustat <- compustat %>% arrange(gvkey, datadate) %>% mutate(price = prcc_f / adjex_f) %>%  group_by(gvkey) %>%mutate(price_prev = lag(price)) %>% ungroup()
compustat$prd_yr <- substr(compustat$datadate, 1, 4)
compustat$prd_mon <- substr(compustat$datadate, 6, 7)
compustat$prd_mon <- str_remove(compustat$prd_mon, "^0+")
compustat <- compustat %>% rename(permno = lpermno, cusip_9 = cusip)
compustat$cusip <- substr(compustat$cusip_9, 1,8)
compustat <- compustat %>% select(permno, gvkey, prd_yr, prd_mon, price_prev, sic)
guidance <- merge(guidance, compustat, by = c("permno","prd_yr","prd_mon"))
rm(compustat)

### Apply final filters
# Limit to guidance within 365 days of firm-year end
guidance$horizon <- as.numeric(guidance$fyear_end - guidance$guidance_date)
guidance <- subset(guidance, horizon >= 1)
guidance <- subset(guidance, horizon < 365)

# Exclude utilities and regulated industries (SIC 4900 - 4949)
guidance <- subset(guidance, sic < 4900 | sic > 4949)
# Exclude financial services (SIC 6000 - 6999)
guidance <- subset(guidance, sic < 6000 | sic > 6999)

# Focus on firm-year obs with more than one guidance instance
guidance <- guidance %>% 
  group_by(permno, prd_yr) %>%
  mutate(obs_count = n()) %>%
  ungroup()

guidance <- subset(guidance, obs_count > 1)

# Last Forecast by Year 
guidance <- guidance %>%
  group_by(permno, fyear_end) %>%
  slice_max(guidance_date, with_ties = TRUE) %>%
  ungroup()

# If there are several forecasts per firm-day: Keep last modification
guidance <- guidance %>%
  arrange(permno, fyear_end, mod_date, mod_time) %>%
  group_by(permno, fyear_end) %>%
  slice_tail(n = 1) 

# Date restriction:
guidance <- subset(guidance, year(guidance_date) >= 2001 & year(guidance_date) <= 2023)

# Drop penny stocks
guidance <- subset(guidance, price_prev >= 10)

### Calculate variables in final dataset
# Scale Up Dependent Variables
guidance$bias <- ((guidance$eps_f - guidance$eps_r) / guidance$price_prev) * 100
guidance$ea_car <- guidance$ea_car * 100
guidance$guidance_car <- guidance$guidance_car * 100

# Log of size and horizon 
guidance$size <- log(1000 * guidance$size)
guidance$horizon <- log(guidance$horizon)

# Loss Dummy
guidance$loss_d <- ifelse(guidance$ep < 0, 1,0)

# Define process risk based on SIC
guidance$process <- ifelse(guidance$sic >= 2833 & guidance$sic <= 2836 | guidance$sic >= 8731 & guidance$sic <= 8734 | guidance$sic >= 3570 & guidance$sic <= 3577 | guidance$sic >= 7371 & guidance$sic <= 7379 | guidance$sic >= 3600 & guidance$sic <= 3674 | guidance$sic >= 5200 & guidance$sic <= 5961, 1, 0)

# Industry dummies based on two-digit SIC code
guidance$sic_two <- factor(substr(guidance$sic, 1, 2))

# Year dummies
guidance$year <- factor(guidance$prd_yr)

# Firm dummies
guidance$permno <- factor(guidance$permno)

# Net Equity Issuer
guidance$stock_issues_d <- ifelse(guidance$stock_issues > 0, 1, 0)

# Generate the lagged bias
setDT(guidance)
guidance[, prev_bias := {
  b_lag <- shift(bias)
  y_lag <- shift(prd_yr)
  fifelse(prd_yr == y_lag + 1L & !is.na(b_lag), b_lag, 0)
}, by = permno]

# Drop if missing main dependent or independent variable
guidance <- subset(guidance, !is.na(bias) & !is.na(cgo) & !is.na(guidance_car) & !is.na(ea_car))

### Winsorization - winsorize all continuous variables at 1% level
winsor_vars <- c("bias","guidance_car","ea_car","cgo","beta","size","bm","horizon","prev_bias","op","analysts","disp","ivol","stock_issues","sentiment","turnover","short","vola")
guidance <- guidance %>%
  mutate(across(
    all_of(winsor_vars),
    ~ {
      qs <- quantile(.x, probs = c(0.01, 0.99), na.rm = TRUE, names = FALSE)
      DescTools::Winsorize(.x, val = qs)
    }
  ))


### Define different sets of controls
controls_1 <- c("beta","size","bm","sic_two","year")
controls_2 <- c(controls_1, "horizon","loss_d","process","prev_bias")
controls_3 <- c(controls_2, "op","analysts","disp","vola","stock_issues_d","sentiment")
controls_4 <- c("beta","size","bm","horizon","loss_d","prev_bias","op","analysts","disp","vola","stock_issues_d","sentiment","permno","year")

controls <- list(
  controls_1 = controls_1,
  controls_2 = controls_2,
  controls_3 = controls_3,
  controls_4 = controls_4
)

############################# Data Analysis - Last Annual Earnings Guidance #############################
############################# Table A6 #############################

c1 <- lm(bias ~ cgo + sic_two + year, data = guidance)
se_1 <- sqrt(diag(vcovCL(c1, cluster = ~ permno + year)))

c2 <- lm(data = guidance, update(bias ~ cgo, as.formula(paste(". ~ . +", paste(controls_1, collapse = " + ")))))
se_2 <- sqrt(diag(vcovCL(c2, cluster = ~ permno + year)))

c3 <- lm(data = guidance, update(bias ~ cgo, as.formula(paste(". ~ . +", paste(controls_2, collapse = " + ")))))
se_3 <- sqrt(diag(vcovCL(c3, cluster = ~ permno + year)))

c4 <- lm(data = guidance, update(bias ~ cgo, as.formula(paste(". ~ . +", paste(controls_3, collapse = " + ")))))
se_4 <- sqrt(diag(vcovCL(c4, cluster = ~ permno + year)))

c5 <- lm(data = guidance, update(bias ~ cgo, as.formula(paste(". ~ . +", paste(controls_4, collapse = " + ")))))
se_5 <- sqrt(diag(vcovCL(c5, cluster = ~ permno + year)))

vcov_fun <- function(x) vcovCL(x, cluster = ~ permno + year)
models <- list("(1)" = c1, "(2)" = c2, "(3)" = c3,"(4)" = c4,"(5)" = c5)

modelsummary(models,  vcov = vcov_fun, coef_omit  = "year|sic_two|permno", estimate   = "{estimate}{stars}", statistic  = "({statistic})", stars      = c("*" = 0.10, "**" = 0.05, "***" = 0.01), gof_omit   = "AIC|BIC|Log.Lik", fmt = 4, output = "latex", title = "Guidance Bias and CGO", escape  = FALSE)
rm(c1, c2, c3, c4, c5, se_1, se_2, se_3, se_4, se_5)

